#!/usr/bin/env python3

"""
This is a program that will insert Git submodules according to SVN externals
definitions (svn:externals properties) from the original Subversion repository
throughout the history.

Information about the externals is obtained from the ".gitsvnextmodules" file
created during SVN-to-Git conversion by SubGit (https://subgit.com/). For this
file to exist, SubGit's config option "translate.externals=true" must have
been enabled during that conversion.

Actual modifications:
- Insert gitlinks (mode 160000) into the tree.
- Add .gitmodules file with relevant sections.
- Remove sections converted to submodules from .gitsvnextmodules file
  and delete it if empty.

.gitsvnextmodules example:
[submodule "somedir/extdir"]
	path = somedir/extdir
	owner = somedir
	url = https://svn.example.com/somesvnrepo/trunk
	revision = 1234
	branch = /
	fetch = :refs/remotes/git-svn
	remote = svn
	type = dir

Resulting addition in "somedir" tree (cat-file pretty-print format):
160000 commit 1234123412341234123412341234123412341234	extdir

Resulting .gitmodules entry:
[submodule "somedir/extdir"]
	path = somedir/extdir
	url = https://git.example.com/somegitrepo.git

SVN-to-Git mapping file:
Can be created from SubGit's "refs/svn/map".
One line per mapping in the following format:
<svn url> TAB <svn rev> TAB <git url> TAB <git commit> TAB <state>
- Leading '#' can be used for comments.
- <svn url> must not contain a trailing slash.
- <state> has to be "commit" to be usable, but can be "missing" if <git commit>
  does not exist in the repository anymore. Adopted from git-cat-file output.
Example:
https://svn.example.com/somesvnrepo/trunk	1234	https://git.example.com/somegitrepo.git	1234123412341234123412341234123412341234	commit

Features:
- Repeatedly added/removed externals will be handled properly.
- Externals replaced by directly added files and vice versa will be handled
  properly.

Caveats:
- This script must NOT be run repeatedly. A second invocation would lead to a
  different result in case the externals could only be converted partially.
- Inconsistent SVN repositories (with failing checkout) are not handled, i.e.
  - normal directory and external with the same path
  - external path not existing for the given revision
- No attention was paid to non-ASCII and special characters in gitlink paths,
  so these might cause problems.
- There is no error handling for mandatory options missing in .gitsvnextmodules
  file. The script would crash in case of such buggy files, but that shouldn't
  happen in practice.

TODO:
- Add external files directly.
- Alternatively add external directories directly instead of using a submodule.
"""

"""
Please see the
  ***** API BACKWARD COMPATIBILITY CAVEAT *****
near the top of git-filter-repo.
"""

import argparse
import os
import sys
import shutil
import subprocess
import configparser
from urllib.parse import urlsplit

try:
  import git_filter_repo as fr
except ImportError:
  raise SystemExit("Error: Couldn't find git_filter_repo.py.  Did you forget to make a symlink to git-filter-repo named git_filter_repo.py or did you forget to put the latter in your PYTHONPATH?")

# Root URL of the SVN repository. Stays empty unless --svn-root-url is given,
# in which case the main script stores it here without trailing slash(es).
# Read by the commit callbacks when resolving relative svn:externals URLs.
svn_root_url = ""
# SVN-to-Git mappings (list of dictionaries as returned by read_mappings()).
# Filled by the main script in conversion mode only; searched by get_git_url()
# and get_git_commit_hash().
svn_git_mappings = []

def parse_args():
  """
  Build the command line interface of this script and evaluate it.

  Besides plain parsing, the mode-dependent option combinations are checked:
  --svn-git-mapfiles is rejected together with --analyze and is mandatory
  without it. A violation terminates the script via SystemExit.

  Returns the argparse namespace with the parsed options.
  """
  cli = argparse.ArgumentParser(
      description="Add Git submodules according to svn:externals from .gitsvnextmodules. "
                  "As preparation for this conversion process, an analysis can be performed.")

  # Options valid for both modes
  cli.add_argument(
      '--force', '-f',
      action='store_true',
      help="Rewrite repository history even if the current repo does not "
           "look like a fresh clone.")
  cli.add_argument(
      '--refs',
      nargs='+',
      help="Limit history rewriting to the specified refs. Option is directly "
           "forwarded to git-filter-repo, see there for details and caveats. "
           "Use for debugging purposes only!")
  cli.add_argument(
      '--svn-root-url',
      help="Root URL of the corresponding SVN repository, "
           "needed for conversion of relative to absolute external URLs.")

  # Options of the analysis mode
  analysis_group = cli.add_argument_group(title="Analysis")
  analysis_group.add_argument(
      '--analyze',
      action='store_true',
      help="Analyze repository history and create auxiliary files for conversion process.")
  analysis_group.add_argument(
      '--report-dir',
      type=os.fsencode,
      help="Directory to write report, defaults to GIT_DIR/filter-repo/svnexternals, "
           "refuses to run if exists, --force delete existing dir first.")

  # Options of the conversion mode
  conversion_group = cli.add_argument_group(title="Conversion")
  conversion_group.add_argument(
      '--svn-git-mapfiles',
      type=os.fsencode,
      nargs='+',
      metavar='MAPFILE',
      help="Files with SVN-to-Git revision mappings for SVN externals conversion.")

  opts = cli.parse_args()

  # Mapping files belong to the conversion mode exclusively
  if opts.analyze:
    if opts.svn_git_mapfiles:
      raise SystemExit("Error: --svn-git-mapfiles makes no sense with --analyze.")
  elif not opts.svn_git_mapfiles:
    raise SystemExit("Error: --svn-git-mapfiles is required for the conversion process.")

  return opts

def read_mappings(mapfiles):
  """
  Read files with SVN-to-Git mappings and return a list of mappings from it.

  Every usable line consists of five TAB-separated fields:
  <svn url> TAB <svn rev> TAB <git url> TAB <git commit> TAB <state>
  Blank lines and lines starting with '#' are skipped.

  Each mapping is returned as a dictionary with the keys 'svn_url', 'svn_rev'
  (as integer), 'git_url', 'git_commit' and 'state', in file order.

  Trailing slashes of the SVN URL are removed, because the URLs looked up
  later never carry one and such a mapping could not match otherwise.

  A line with too few fields, a non-numeric revision or undecodable content
  terminates the script via SystemExit, naming the file and line number.
  """
  mappings = []
  for mapfile in mapfiles:
    with open(mapfile, "rb") as f:
      for lineno, raw in enumerate(f, 1):
        raw = raw.rstrip(b'\r\n')

        # Skip blank and comment lines
        if not raw or raw.startswith(b'#'):
          continue

        try:
          # Convert to string for use with configparser later. The last field
          # takes the remainder of the line (maxsplit), so too few fields make
          # the unpacking fail while surplus TABs end up in <state>.
          svn_url, svn_rev, git_url, git_commit, state = raw.decode().split('\t', 4)
          svn_rev = int(svn_rev)
        except ValueError:
          # Also covers UnicodeDecodeError, which is a ValueError subclass
          raise SystemExit("Error: Malformed SVN-to-Git mapping in \"%s\", line %d: %r"
                           % (os.fsdecode(mapfile), lineno, raw)) from None

        mappings.append({'svn_url': svn_url.rstrip('/'),
                         'svn_rev': svn_rev,
                         'git_url': git_url,
                         'git_commit': git_commit,
                         'state': state})
  return mappings

# "git cat-file --batch" process used by parse_config() to read blob contents.
# The main script replaces this placeholder with the running process before
# any filtering starts.
cat_file_process = None
def parse_config(blob_id):
  """
  Create a configparser object for a .gitsvnextmodules/.gitmodules file from
  its blob ID.

  blob_id is the object ID as bytes, or None. For None, nothing is read and an
  empty configparser object is returned.

  The contents are requested from the module-level cat_file_process, which
  has to be running.
  """
  # Git config files know no value interpolation, so configparser's default
  # interpolation has to be disabled. Otherwise accessing any value containing
  # '%' (e.g. "%20" in an SVN URL) would raise an interpolation error.
  parsed_config = configparser.ConfigParser(interpolation=None)

  if blob_id is not None:
    # Get the blob contents
    cat_file_process.stdin.write(blob_id + b'\n')
    cat_file_process.stdin.flush()
    # Header line of the response: <object id> <type> <size>
    _, _, objsize = cat_file_process.stdout.readline().split()
    # The contents are followed by a newline, which has to be consumed as well
    contents_plus_newline = cat_file_process.stdout.read(int(objsize)+1)

    # Parse it
    parsed_config.read_string(contents_plus_newline.decode())

  return parsed_config

def create_blob(parsed_config):
  """
  Create a filter-repo blob object from a .gitsvnextmodules/.gitmodules
  configparser object according to Git config style.

  Each section becomes a "[<section>]" line followed by one TAB-indented
  "<option> = <value>" line per option. An object without sections results in
  a blob with empty contents.
  """
  lines = []
  for sec in parsed_config.sections():
    lines.append("[" + sec + "]\n")
    for opt in parsed_config.options(sec):
      # Write the raw value: Git config files know no interpolation, and with
      # configparser's default interpolation a value containing '%' (e.g.
      # "%20" in a URL) would raise or be altered on a non-raw access.
      value = parsed_config.get(sec, opt, raw=True)
      lines.append("\t" + opt + " = " + value + "\n")

  return fr.Blob(''.join(lines).encode())

def get_git_url(svn_url):
  """
  Look up the Git URL belonging to an SVN URL.

  The module-level SVN-to-Git mappings are searched in order, the Git URL of
  the first mapping with exactly this SVN URL is returned. Returns None if
  there is no such mapping.
  """
  matching_urls = (mapping['git_url']
                   for mapping in svn_git_mappings
                   if mapping['svn_url'] == svn_url)
  return next(matching_urls, None)

def get_git_commit_hash(svn_url, svn_rev):
  """
  Look up the Git commit hash belonging to an SVN URL+revision.

  Not only an exact revision match is accepted: among the mappings of this
  SVN URL, the one with the highest revision not exceeding svn_rev is taken.
  Needed when the revision was set to that of the root URL instead of to that
  of the specific subdirectory (e.g. trunk). TortoiseSVN behaves so when
  setting the external to HEAD.

  Returns None if there is no such mapping or if its state is not "commit".
  """
  # Mappings of this URL with a (positive) revision up to the requested one
  candidates = [mapping for mapping in svn_git_mappings
                if mapping['svn_url'] == svn_url
                and 0 < mapping['svn_rev'] <= svn_rev]
  if not candidates:
    return None

  # max() yields the first of several mappings sharing the highest revision
  nearest = max(candidates, key=lambda mapping: mapping['svn_rev'])
  if nearest['state'] != "commit":
    return None
  return nearest['git_commit']

def get_absolute_svn_url(svnext_url, svn_root_url):
  """
  Turn a relative svn:externals URL into an absolute one.

  Returns a tuple (success, url):
  - URLs starting with "../" or "^/../" are not supported, they are returned
    with success=False.
  - Without root URL, nothing is converted.
  - "^/..." is appended to the root URL, "//..." gets the scheme of the root
    URL prepended and "/..." gets scheme and host of the root URL prepended.
  - Any other URL is considered absolute already.

  Trailing slashes are removed from the returned URL in every case.
  """
  url = svnext_url.rstrip("/")
  root = svn_root_url.rstrip("/")
  root_parts = urlsplit(root)

  # Formats relative to a parent directory cannot be resolved
  if url.startswith(("../", "^/../")):
    return (False, url)

  # Nothing to resolve against
  if not root:
    return (True, url)

  # Mind the order, "//" has to be checked before "/"
  if url.startswith("^/"):
    return (True, root + url[1:])
  if url.startswith("//"):
    return (True, root_parts.scheme + ":" + url)
  if url.startswith("/"):
    return (True, root_parts.scheme + "://" + root_parts.netloc + url)

  return (True, url)

def parse_revision_value(value):
  """
  Convert the value of key 'revision' from a .gitsvnextmodules file to an
  integer.

  SubGit abbreviates some revisions, e.g. it writes 1k, 2k, 3k instead of
  1024, 2048, 3072, likewise 1m, 2m, ..., 1g, ... Such a trailing unit is
  expanded to its factor (powers of 1024), any other value is expected to be
  a plain number.
  """
  factors = {"k": 1024, "m": 1024**2, "g": 1024**3}

  factor = factors.get(value[-1])
  if factor is None:
    return int(value)
  return int(value[:-1]) * factor

def add_submodule_tree_entry(commit, parsed_config, section):
  """
  Append a gitlink (submodule tree entry) to the file changes of a Git commit.

  The SVN external is described by the given section of a parsed
  .gitsvnextmodules file. The module-level root URL and SVN-to-Git mappings
  are used to find the Git commit to link.

  Returns True if the gitlink was added. Returns False, leaving the commit
  untouched, if the external cannot be converted: not a directory external,
  unsupported URL format, no revision given, or no usable mapping.
  """
  external = parsed_config[section]

  # Only directory externals can become submodules, SVN file externals
  # (type=file) cannot
  if external['type'] != 'dir':
    return False

  # An unsupported URL format cannot be mapped
  convertible, svn_url = get_absolute_svn_url(external['url'], svn_root_url)
  if not convertible:
    return False

  # TODO: without 'revision', it has to be guessed according to commit
  # timestamp, skip for now
  if not parsed_config.has_option(section, 'revision'):
    return False
  svn_rev = parse_revision_value(external['revision'])

  # Map SVN url+revision to the Git commit, give up if the mapping is missing
  # or unusable
  mapped_commit = get_git_commit_hash(svn_url, svn_rev)
  if mapped_commit is None:
    return False

  gitlink_target = mapped_commit.encode()
  gitlink_path = external['path'].encode()

  # Mode 160000 marks the tree entry as gitlink
  commit.file_changes.append(
      fr.FileChange(b'M', gitlink_path, gitlink_target, b'160000'))
  return True

def get_commit_map_path():
  """
  Return the path (bytes) of the "filter-repo/commit-map" file below the Git
  directory determined for the current working directory.
  """
  return os.path.join(fr.GitUtils.determine_git_dir(b'.'),
                      b'filter-repo',
                      b'commit-map')

def parse_commit_map(commit_map_file):
  """
  Read a commit-map file into a dictionary mapping old to new commit IDs
  (both as bytes).

  Every non-blank line has to consist of two whitespace-separated columns.
  The "old"/"new" header of the first line is stored like any other pair.
  """
  with open(commit_map_file, "rb") as f:
    records = (raw.rstrip(b'\r\n') for raw in f)
    # Blank lines are dropped before splitting into the two columns
    columns = (record.split() for record in records if record)
    return {old: new for old, new in columns}

def merge_commit_maps(old_commit_map, new_commit_map):
  """
  Chain two commit-maps, leaving out the intermediate commits.

  Every key of the old map is kept. Its value is looked up in the new map and
  replaced by the result; it stays as is if the new map has no entry for it.

  Returns the merged dictionary, the passed ones are not modified.
  """
  return {original: new_commit_map.get(intermediate, intermediate)
          for original, intermediate in old_commit_map.items()}

def write_commit_map(commit_map, commit_map_file):
  """
  Write a commit-map dictionary (bytes keys and values) to a file, replacing
  any previous contents.

  One line per entry: the key left-aligned in a 40 characters wide column,
  followed by a space and the value.
  """
  with open(commit_map_file, 'wb') as out:
    out.writelines(b'%-40s %s\n' % entry for entry in commit_map.items())

def create_report_dir(args):
  """
  Create the directory for analysis report.

  The directory is args.report_dir if given, GIT_DIR/filter-repo/svnexternals
  otherwise (the intermediate "filter-repo" directory is created as
  necessary).

  An already existing directory is removed recursively first if args.force is
  set. Without args.force, an error message is printed and the script exits
  with status 1.

  Returns the path of the newly created directory.
  """
  if args.report_dir:
    reportdir = args.report_dir
  else:
    git_dir = fr.GitUtils.determine_git_dir(b'.')

    # Create the parent of the default report directory as necessary
    results_tmp_dir = os.path.join(git_dir, b'filter-repo')
    if not os.path.isdir(results_tmp_dir):
      os.mkdir(results_tmp_dir)
    reportdir = os.path.join(results_tmp_dir, b'svnexternals')

  if os.path.isdir(reportdir):
    if args.force:
      # Terminate the line, any subsequent output would be glued to the
      # warning otherwise
      sys.stdout.write("Warning: Removing recursively: \"%s\"\n" % fr.decode(reportdir))
      shutil.rmtree(reportdir)
    else:
      sys.stdout.write("Error: dir already exists (use --force to delete): \"%s\"\n" % fr.decode(reportdir))
      sys.exit(1)

  os.mkdir(reportdir)

  return reportdir

analysis = {'dir_ext_orig': [],
            'dir_ext_abs': [],
            'file_ext_orig': [],
            'file_ext_abs': []}
def write_analysis(reportdir):
  """
  Write the collected analysis to the report directory.

  Each list of the module-level analysis dictionary is sorted in place and
  written to its own text file, one URL per line. Progress is reported on
  stdout.
  """
  # Report file name -> key of the analysis list it is written from
  reports = ((b"dir-externals-original.txt", 'dir_ext_orig'),
             (b"dir-externals-absolute.txt", 'dir_ext_abs'),
             (b"file-externals-original.txt", 'file_ext_orig'),
             (b"file-externals-absolute.txt", 'file_ext_abs'))

  for _, key in reports:
    analysis[key].sort()

  sys.stdout.write("Writing reports to %s..." % fr.decode(reportdir))
  sys.stdout.flush()

  for filename, key in reports:
    with open(os.path.join(reportdir, filename), 'wb') as report:
      report.writelines(("%s\n" % url).encode() for url in analysis[key])

  sys.stdout.write("done.\n")

def analyze_externals(commit, metadata):
  """
  Collect the svn:externals URLs of a Git commit for the analysis.

  Used as filter-repo commit callback, metadata is not evaluated.

  If the commit adds or modifies .gitsvnextmodules, the URL of every section
  is recorded in the module-level analysis lists: once as found in the file
  and once converted to an absolute URL, separately for directory externals
  and all other (file) externals. URLs already recorded are not added again.
  """
  for change in commit.file_changes:
    if change.filename != b'.gitsvnextmodules' or change.type != b'M':
      continue

    externals = parse_config(change.blob_id)
    for sec in externals.sections():
      orig_url = externals[sec]['url']
      # If the conversion was not successful, abs_url is the original URL
      # without trailing slashes; it is added to the absolute list anyway
      _, abs_url = get_absolute_svn_url(orig_url, svn_root_url)

      if externals[sec]['type'] == 'dir':
        targets = (('dir_ext_orig', orig_url), ('dir_ext_abs', abs_url))
      else:
        targets = (('file_ext_orig', orig_url), ('file_ext_abs', abs_url))

      for key, found_url in targets:
        if found_url not in analysis[key]:
          analysis[key].append(found_url)

def insert_submodules(commit, metadata):
  """
  Insert submodules for a Git commit.

  Used as filter-repo commit callback, metadata is not evaluated.

  Since .gitsvnextmodules just contains the svn:externals state for the given
  commit, we cannot derive specific changes from that file.
  So we can only add/modify the gitlinks according to .gitsvnextmodules
  (without knowing whether adding a new or modifying an existing or even
  "modifying" an unchanged submodule, but none of that really matters).
  We do not have information about deleted externals, those will be handled in
  a separate filter run afterwards.

  The .gitmodules file however will already be correct in this function because
  we don't need to know about specific changes to add, modify or delete it.

  Raises SystemExit if a gitlink could be added but the mappings contain no
  Git URL for the external.
  """
  # Iterate over a snapshot because file changes get appended within the loop
  for change in list(commit.file_changes):
    if change.filename != b'.gitsvnextmodules' or change.type not in (b'M', b'D'):
      continue

    gitsvnextmodules = parse_config(change.blob_id)
    # Git config files know no value interpolation. With configparser's default
    # interpolation, assigning a Git URL containing '%' would raise.
    gitmodules = configparser.ConfigParser(interpolation=None)

    # Add gitlinks to the tree and prepare .gitmodules file content
    for sec in gitsvnextmodules.sections():
      if not add_submodule_tree_entry(commit, gitsvnextmodules, sec):
        # External not converted, no .gitmodules entry either
        continue

      # Gitlink added
      # -> Add this entry to .gitmodules as well

      # Create the section name string manually, do not rely on
      # .gitsvnextmodules to always use the proper section name.
      path = gitsvnextmodules[sec]['path']
      sec_name = 'submodule "' + path + '"'
      gitmodules[sec_name] = {}

      # submodule.<name>.path
      gitmodules[sec_name]['path'] = path

      # submodule.<name>.url
      # The URL format is known to be supported here, the gitlink would not
      # have been added otherwise.
      _, svn_url = get_absolute_svn_url(gitsvnextmodules[sec]['url'], svn_root_url)
      git_url = get_git_url(svn_url)
      if git_url is None:
        # Abort, but this will not happen in practice, caught in
        # add_submodule_tree_entry() via get_git_commit_hash() already.
        raise SystemExit("Error: No Git URL found in mapping although a commit hash could be found.")
      gitmodules[sec_name]['url'] = git_url

    # Write blob and adapt tree for .gitmodules
    if gitmodules.sections():
      # Create a blob object from the content and add it to the tree.
      # "filter" is the RepoFilter object created by the main script.
      blob = create_blob(gitmodules)
      filter.insert(blob)
      commit.file_changes.append(fr.FileChange(b'M', b'.gitmodules', blob.id, b'100644'))
    else:
      # Delete the file, even if a "git rm" of all submodules keeps it empty.
      commit.file_changes.append(fr.FileChange(b'D', b'.gitmodules'))

def delete_submodules(commit, metadata):
  """
  Delete submodules from a Git commit.

  Used as filter-repo commit callback, metadata is not evaluated.

  Delete all submodules (inserted in the previous filter run) without an entry
  in .gitsvnextmodules, these were real deletions of externals, which couldn't
  be detected before.
  Only the tree entries have to be removed because the .gitmodules file is
  already in correct state from previous filter run.

  In addition, .gitsvnextmodules is reduced to the externals that were not
  converted, or deleted if none are left.
  """
  # Only commits adding, modifying or deleting .gitsvnextmodules are processed.
  # Note that file changes get appended to the list being iterated.
  for change in commit.file_changes:
    if change.filename == b'.gitsvnextmodules' and change.type in (b'M', b'D'):
      # Results in an empty config (no sections) if the blob ID is None
      gitsvnextmodules = parse_config(change.blob_id)

      # Search for all submodules in the tree of the commit as it exists in the
      # repository (original ID). Records are NUL-terminated (-z):
      # <mode> SP <type> SP <object id> TAB <path>
      output = subprocess.check_output('git ls-tree -d -r -z'.split() + [commit.original_id])
      for line in output.split(b'\x00'):
        # Skip the empty remainder after the last NUL
        if not line:
          continue
        mode_objtype_objid, dirname = line.split(b'\t', 1)
        mode, objtype, objid = mode_objtype_objid.split(b' ')
        if mode == b'160000' and objtype == b'commit':
          # Submodule found
          # -> Delete it if there is no corresponding entry in
          #    .gitsvnextmodules, keep/reinsert it otherwise
          for sec in gitsvnextmodules.sections():
            if gitsvnextmodules[sec]['path'].encode() == dirname:
              # Reinsert it, might have been deleted in previous commits
              if add_submodule_tree_entry(commit, gitsvnextmodules, sec):
                # And remove the config section because this external has been
                # converted
                gitsvnextmodules.remove_section(sec)
                break
          else:
            # Reached when the loop above ended without "break": no section
            # has this path, or the reinsertion was not successful.
            # Delete it
            commit.file_changes.append(fr.FileChange(b'D', dirname))

      # Rewrite .gitsvnextmodules to contain the unhandled externals only,
      # delete it if empty (all externals converted).
      if gitsvnextmodules.sections():
        # Create a blob object from the content and replace the original one.
        # "filter" is the RepoFilter object created by the main script.
        blob = create_blob(gitsvnextmodules)
        filter.insert(blob)
        change.blob_id = blob.id
      else:
        if change.type == b'M':
          # File became empty, delete it
          commit.file_changes.append(fr.FileChange(b'D', b'.gitsvnextmodules'))
          break # avoid endless for loop
        #else:
          # change.type is b'D' here: the delete command for the file is
          # present in the stream already, nothing to do

# Main script: evaluate the command line, then run either the analysis or the
# two-pass conversion.
my_args = parse_args()

# Use passed URL without trailing slash(es)
if my_args.svn_root_url:
  svn_root_url = my_args.svn_root_url.rstrip("/")

# Arguments forwarded to filter-repo
extra_args = []
if my_args.force:
  extra_args = ['--force']
if my_args.refs:
  extra_args += ['--refs'] + my_args.refs

# Started for both modes: parse_config() reads the blob contents through this
# process. Closed at the very end of the script.
cat_file_process = subprocess.Popen(['git', 'cat-file', '--batch'],
                                    stdin = subprocess.PIPE,
                                    stdout = subprocess.PIPE)
if my_args.analyze:
  # Analysis
  reportdir = create_report_dir(my_args)

  # filter-repo is run with --dry-run here, the callback only collects URLs
  fr_args = fr.FilteringOptions.parse_args(['--dry-run']
                                           + extra_args)
  filter = fr.RepoFilter(fr_args, commit_callback=analyze_externals)
  filter.run()

  write_analysis(reportdir)
else:
  # Conversion
  svn_git_mappings = read_mappings(my_args.svn_git_mapfiles)

  # There are no references to commit hashes in commit messages because this
  # script runs on a Git repository converted from a Subversion repository.
  fr_args = fr.FilteringOptions.parse_args(['--preserve-commit-hashes',
                                            '--preserve-commit-encoding',
                                            '--replace-refs', 'update-no-add']
                                           + extra_args)
  # First run: insert gitlinks and .gitmodules.
  # The global name "filter" is used by the callbacks to insert new blobs, so
  # it has to refer to the RepoFilter object of the current run.
  filter = fr.RepoFilter(fr_args, commit_callback=insert_submodules)
  filter.run()

  # Store commit-map after first run
  first_commit_map = parse_commit_map(get_commit_map_path())

  # Second run: delete the gitlinks of removed externals and reduce
  # .gitsvnextmodules to the externals not converted
  filter = fr.RepoFilter(fr_args, commit_callback=delete_submodules)
  filter.run()

  # Update commit-map after second run, based on original IDs
  second_commit_map = parse_commit_map(get_commit_map_path())
  merged_commit_map = merge_commit_maps(first_commit_map, second_commit_map)
  write_commit_map(merged_commit_map, get_commit_map_path())

# Closing stdin lets the cat-file process terminate, then wait for it
cat_file_process.stdin.close()
cat_file_process.wait()
